Create factor variables for origin.

#install.packages("ISLR")
library("ISLR")
## 
## Attaching package: 'ISLR'
## The following object is masked _by_ '.GlobalEnv':
## 
##     Auto
## The following objects are masked from 'package:ISLR2':
## 
##     Auto, Credit
#install.packages("SmartEDA")
library("SmartEDA")
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## Load the Carseats sample dataset from the ISLR package
Carseats <- ISLR::Carseats
# Overview of the data (type = 1)
ExpData(data = Auto, type = 1)
##                                           Descriptions    Value
## 1                                   Sample size (nrow)      392
## 2                              No. of variables (ncol)        9
## 3                    No. of numeric/interger variables        8
## 4                              No. of factor variables        1
## 5                                No. of text variables        0
## 6                             No. of logical variables        0
## 7                          No. of identifier variables        0
## 8                                No. of date variables        0
## 9             No. of zero variance variables (uniform)        0
## 10               %. of variables having complete cases 100% (9)
## 11   %. of variables having >0% and <50% missing cases   0% (0)
## 12 %. of variables having >=50% and <90% missing cases   0% (0)
## 13          %. of variables having >=90% missing cases   0% (0)
# Structure of the data (type = 2): one row per variable
ExpData(data = Auto, type = 2)
##   Index Variable_Name Variable_Type Sample_n Missing_Count Per_of_Missing
## 1     1           mpg       numeric      392             0              0
## 2     2     cylinders       integer      392             0              0
## 3     3  displacement       numeric      392             0              0
## 4     4    horsepower       integer      392             0              0
## 5     5        weight       integer      392             0              0
## 6     6  acceleration       numeric      392             0              0
## 7     7          year       integer      392             0              0
## 8     8        origin       integer      392             0              0
## 9     9          name        factor      392             0              0
##   No_of_distinct_values
## 1                   127
## 2                     5
## 3                    81
## 4                    93
## 5                   346
## 6                    95
## 7                    13
## 8                     3
## 9                   301
# Per-variable metadata extended with the mean, median and variance
ExpData(
  data = Auto,
  type = 2,
  fun = c("mean", "median", "var")
)
##   Index Variable_Name Variable_Type Sample_n Missing_Count Per_of_Missing
## 1     1           mpg       numeric      392             0              0
## 2     2     cylinders       integer      392             0              0
## 3     3  displacement       numeric      392             0              0
## 4     4    horsepower       integer      392             0              0
## 5     5        weight       integer      392             0              0
## 6     6  acceleration       numeric      392             0              0
## 7     7          year       integer      392             0              0
## 8     8        origin       integer      392             0              0
## 9     9          name        factor      392             0              0
##   No_of_distinct_values    mean  median       var
## 1                   127   23.45   22.75     60.92
## 2                     5    5.47    4.00      2.91
## 3                    81  194.41  151.00  10950.37
## 4                    93  104.47   93.50   1481.57
## 5                   346 2977.58 2803.50 721484.71
## 6                    95   15.54   15.50      7.61
## 7                    13   75.98   76.00     13.57
## 8                     3    1.58    1.00      0.65
## 9                   301      NA      NA        NA
# Derive Quantile 
# Return the 10th percentile of `x`, ignoring missing values.
#
# x: a numeric vector.
# Returns the length-one named value (name "10%") produced by quantile().
# The result is the last expression, so it is returned visibly and no local
# variable shadows the function's own name.
quantile_10 <- function(x) {
  quantile(x, probs = 0.1, na.rm = TRUE)
}

# Return the 90th percentile of `x`, ignoring missing values.
#
# x: a numeric vector.
# Returns the length-one named value (name "90%") produced by quantile().
# The result is the last expression, so it is returned visibly and no local
# variable shadows the function's own name.
quantile_90 <- function(x) {
  quantile(x, probs = 0.9, na.rm = TRUE)
}

# Summarise every variable with the custom 10th/90th percentile functions,
# which are passed to ExpData by name
output_e1 <- ExpData(
  data = Auto,
  type = 2,
  fun = c("quantile_10", "quantile_90")
)

output_e1
##   Index Variable_Name Variable_Type Sample_n Missing_Count Per_of_Missing
## 1     1           mpg       numeric      392             0              0
## 2     2     cylinders       integer      392             0              0
## 3     3  displacement       numeric      392             0              0
## 4     4    horsepower       integer      392             0              0
## 5     5        weight       integer      392             0              0
## 6     6  acceleration       numeric      392             0              0
## 7     7          year       integer      392             0              0
## 8     8        origin       integer      392             0              0
## 9     9          name        factor      392             0              0
##   No_of_distinct_values quantile_10 quantile_90
## 1                   127          14       34.19
## 2                     5           4        8.00
## 3                    81          90      350.00
## 4                    93          67      157.70
## 5                   346        1990     4277.60
## 6                    95          12       19.00
## 7                    13          71       81.00
## 8                     3           1        3.00
## 9                   301          NA          NA
#Graphical representation of all numerical features

#Density plot (univariate)
# Note: a variable is excluded if its number of unique values is less than or equal to 10 [nlim=10]
plot1 <- ExpNumViz(Auto, target = NULL, nlim = 10, Page = c(2, 2), sample = 4)
plot1[[1]]

#Frequency table for all categorical independent variables (origin)
# `per = TRUE` is spelled out: `T` is an ordinary variable that can be reassigned.
ExpCTable(
  Auto,
  Target = NULL,
  margin = 1,
  clim = 10,
  nlim = 3,
  round = 2,
  bin = NULL,
  per = TRUE
)
##   Variable Valid Frequency Percent CumPercent
## 1   origin     1       245   62.50      62.50
## 2   origin     2        68   17.35      79.85
## 3   origin     3        79   20.15     100.00
## 4   origin TOTAL       392      NA         NA
#Bar plots for all categorical variables (cylinders and origin)
plot2 <- ExpCatViz(
  Auto,
  target = NULL,
  col = "slateblue4",
  clim = 10,
  margin = 2,
  Page = c(2, 2)
)
plot2[[1]]

#Graphical representation of all numeric variables
#Scatter plots between all numeric variables and the target variable mpg. These plots help to examine how well the target variable is correlated with the independent variables.

#Dependent variable is mpg (continuous).

#Note: no sample argument is passed here, so no random subset of plots is requested
#Note: nlim=4 means only numeric variables with more than 4 unique values are included
plot3 <- ExpNumViz(
  Auto,
  target = "mpg",
  nlim = 4,
  scatter = FALSE,
  fname = NULL,
  col = "green",
  Page = c(2, 2)
)
plot3[[1]]

#Box plots for all numerical variables vs a categorical target variable - bivariate comparison by category
#Box plot of every numeric attribute for each category of origin
# NOTE(review): the rendered output below reports "Insufficient values in colour"
# although three colours are supplied for the three origin codes - confirm how
# SmartEDA counts the categories of an integer-coded target.

plot4 <- ExpNumViz(
  Auto,
  target = "origin",
  type = 1,
  nlim = 3,
  fname = NULL,
  col = c("darkgreen", "springgreen3", "springgreen1"),
  Page = c(2, 2)
)
## Insufficient values in colour, number of colours should be equal to number of categories
## Insufficient values in colour, number of colours should be equal to number of categories
## Insufficient values in colour, number of colours should be equal to number of categories
## Insufficient values in colour, number of colours should be equal to number of categories
## Insufficient values in colour, number of colours should be equal to number of categories
## Insufficient values in colour, number of colours should be equal to number of categories
## Insufficient values in colour, number of colours should be equal to number of categories
## Insufficient values in colour, number of colours should be equal to number of categories
plot4[[1]]

# Labelled factor version of the numeric origin code (1, 2, 3)
Auto[["origin_fac"]] <- factor(
  Auto[["origin"]],
  levels = c(1, 2, 3),
  labels = c("American", "European", "Japanese")
)

Fit a model regressing mpg on horsepower, displacement, acceleration, cylinders, and origin.

What do you notice about the standard errors of the cylinders levels compared to the other predictors?

The standard error of cylinders (0.419) is smaller than those of the origin levels (about 0.72–0.73), but larger than those of horsepower (0.016), displacement (0.009), and acceleration (0.114).

## 
## Call:
## lm(formula = mpg ~ horsepower + displacement + acceleration + 
##     cylinders + origin_fac, data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.7214  -3.0124  -0.3059   2.0035  16.1041 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        46.176620   2.744110  16.828  < 2e-16 ***
## horsepower         -0.103514   0.015974  -6.480 2.81e-10 ***
## displacement       -0.010969   0.009187  -1.194 0.233202    
## acceleration       -0.376941   0.113511  -3.321 0.000984 ***
## cylinders          -0.868888   0.418514  -2.076 0.038545 *  
## origin_facEuropean  0.914347   0.731276   1.250 0.211933    
## origin_facJapanese  3.324201   0.722704   4.600 5.75e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.343 on 385 degrees of freedom
## Multiple R-squared:  0.6951, Adjusted R-squared:  0.6904 
## F-statistic: 146.3 on 6 and 385 DF,  p-value: < 2.2e-16

Use the count() function to generate a table showing how many cars are in each cylinder level.

# Attach dplyr, which provides count()
library(dplyr)

# Tabulate how many cars fall in each cylinder level
cylinder_counts <- count(Auto, cylinders)

# Show the table
print(cylinder_counts)
##   cylinders   n
## 1         3   4
## 2         4 199
## 3         5   3
## 4         6  83
## 5         8 103

Filter out cylinders = 3 and cylinders = 5 (these levels have extremely low counts: 4 and 3 cars), keeping only the 4-, 6-, and 8-cylinder cars.

# Drop the sparsely populated 3- and 5-cylinder groups
filtered_data <- filter(Auto, cylinders != 3 & cylinders != 5)

# Re-tabulate the remaining cylinder levels
new_cylinder_counts <- count(filtered_data, cylinders)

print(new_cylinder_counts)
##   cylinders   n
## 1         4 199
## 2         6  83
## 3         8 103

Look at the residual and qq plots for this model. What do you observe? Do any of the regression assumptions appear to be violated here? If so, which one(s)? Note that you can generate certain plots using the which option, i.e., plot(model_object, which=1) generates the residual plot, and plot(model_object, which=2) generates the qq-plot.

On the residual plot, the pattern is non-linear and the variance of the errors appears to increase from left to right. The linearity and equal-variance assumptions therefore appear to be violated. To address the non-linearity, transform the x variables (log(x)); to address the unequal variance, transform the y variable (log(y)).

# Recode cylinders: the 3- and 5-cylinder cars become NA, every other value is
# kept as a double. The factor column 'new_cylinders_fac' is then built from
# the recoded values with labels "4", "6", and "8".
Auto <- Auto %>%
  mutate(
    new_cylinders = replace(
      as.numeric(cylinders),
      cylinders %in% c(3, 5),
      NA_real_
    ),
    new_cylinders_fac = factor(
      new_cylinders,
      levels = c(4, 6, 8),
      labels = c("4", "6", "8")
    )
  )

# Labelled factor version of origin in Auto
Auto[["origin_fac"]] <- factor(
  Auto[["origin"]],
  levels = c(1, 2, 3),
  labels = c("American", "European", "Japanese")
)

# Linear regression on the updated dataset; lm() drops the rows whose
# new_cylinders_fac is NA
filtered_regression <- lm(
  formula = mpg ~ horsepower + displacement + acceleration +
    new_cylinders_fac + origin_fac,
  data = Auto
)

# Coefficient table and fit statistics
summary(filtered_regression)
## 
## Call:
## lm(formula = mpg ~ horsepower + displacement + acceleration + 
##     new_cylinders_fac + origin_fac, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.4290 -2.3113 -0.6687  1.7092 17.2243 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        44.504785   2.459636  18.094  < 2e-16 ***
## horsepower         -0.110501   0.015256  -7.243 2.49e-12 ***
## displacement       -0.012215   0.008551  -1.428 0.153977    
## acceleration       -0.375344   0.105918  -3.544 0.000444 ***
## new_cylinders_fac6 -4.755474   0.887332  -5.359 1.46e-07 ***
## new_cylinders_fac8 -2.972455   1.582911  -1.878 0.061175 .  
## origin_facEuropean -0.114659   0.698862  -0.164 0.869768    
## origin_facJapanese  2.964612   0.680534   4.356 1.71e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.014 on 377 degrees of freedom
##   (7 observations deleted due to missingness)
## Multiple R-squared:  0.7425, Adjusted R-squared:  0.7377 
## F-statistic: 155.3 on 7 and 377 DF,  p-value: < 2.2e-16
# Residuals vs fitted values
plot(filtered_regression, which = 1)

# Normal Q-Q plot of the residuals
plot(filtered_regression, which = 2)

Generate a plot of mpg and displacement. Do you think a transformation of the displacement variable might be appropriate? If so, which one and why?

It seems like on the residual plot, the pattern is non-linear and the variance of errors seems to increase from left to right. Therefore, the linearity and equal variance of errors need to be adjusted. For linearity, adjust x variables log(x), for equal variance, adjust y variable log(y)

# Simple linear regression of mpg on displacement
filtered_displacement_regression <- lm(
  formula = mpg ~ displacement,
  data = Auto
)

# Coefficient table and fit statistics
summary(filtered_displacement_regression)
## 
## Call:
## lm(formula = mpg ~ displacement, data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.9170  -3.0243  -0.5021   2.3512  18.6128 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  35.12064    0.49443   71.03   <2e-16 ***
## displacement -0.06005    0.00224  -26.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.635 on 390 degrees of freedom
## Multiple R-squared:  0.6482, Adjusted R-squared:  0.6473 
## F-statistic: 718.7 on 1 and 390 DF,  p-value: < 2.2e-16
# Residuals vs fitted values
plot(filtered_displacement_regression, which = 1)

# Normal Q-Q plot of the residuals
plot(filtered_displacement_regression, which = 2)

Transform the mpg and displacement predictor variable

# Log-log model: log(mpg) regressed on log(displacement)
filtered_log_displacement_regression <- lm(
  formula = log(mpg) ~ log(displacement),
  data = Auto
)

# Coefficient table and fit statistics
summary(filtered_log_displacement_regression)
## 
## Call:
## lm(formula = log(mpg) ~ log(displacement), data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.69075 -0.11098  0.00436  0.11706  0.78109 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        5.91367    0.08501   69.56   <2e-16 ***
## log(displacement) -0.54903    0.01649  -33.30   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1737 on 390 degrees of freedom
## Multiple R-squared:  0.7398, Adjusted R-squared:  0.7391 
## F-statistic:  1109 on 1 and 390 DF,  p-value: < 2.2e-16
# Residuals vs fitted values
plot(filtered_log_displacement_regression, which = 1)

# Normal Q-Q plot of the residuals
plot(filtered_log_displacement_regression, which = 2)

Then, add color to your plot for cylinders. Is an interaction term appropriate? Finally, replace cylinders to color by origin. Is an interaction term appropriate?

Interaction terms needed for origin

# log(mpg) against log(displacement), coloured by cylinder group, with one
# least-squares line per group. `se = FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
ggplot(Auto, aes(x = log(displacement), y = log(mpg), col = new_cylinders_fac)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "log displacement",
    y = "log MPG",
    col = "cylinders"
  )
## `geom_smooth()` using formula = 'y ~ x'

# log(mpg) against log(displacement), coloured by origin, with one
# least-squares line per group. `se = FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
ggplot(Auto, aes(x = log(displacement), y = log(mpg), col = origin_fac)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "log displacement",
    y = "log MPG",
    col = "origins"
  )
## `geom_smooth()` using formula = 'y ~ x'

Add interaction terms on log displacement and origin

# Log-log displacement model with an origin interaction
# (main effects plus log(displacement):origin_fac)
filtered_log_displacement_regression_add_interaction <- lm(
  formula = log(mpg) ~ log(displacement) * origin_fac,
  data = Auto
)

# Coefficient table and fit statistics
summary(filtered_log_displacement_regression_add_interaction)
## 
## Call:
## lm(formula = log(mpg) ~ log(displacement) * origin_fac, data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.62047 -0.10709  0.00205  0.10441  0.77754 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                           6.20859    0.13253  46.846  < 2e-16 ***
## log(displacement)                    -0.60135    0.02437 -24.678  < 2e-16 ***
## origin_facEuropean                    0.10320    0.51210   0.202  0.84040    
## origin_facJapanese                   -1.33515    0.43693  -3.056  0.00240 ** 
## log(displacement):origin_facEuropean -0.04446    0.10843  -0.410  0.68205    
## log(displacement):origin_facJapanese  0.28063    0.09348   3.002  0.00286 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1702 on 386 degrees of freedom
## Multiple R-squared:  0.7528, Adjusted R-squared:  0.7496 
## F-statistic: 235.1 on 5 and 386 DF,  p-value: < 2.2e-16
# Residuals vs fitted values
plot(filtered_log_displacement_regression_add_interaction, which = 1)

# Normal Q-Q plot of the residuals
plot(filtered_log_displacement_regression_add_interaction, which = 2)

Generate a plot of mpg and acceleration. Do you think a transformation of the acceleration variable might be appropriate? If so, which one and why?

It seems like on the residual plot, the pattern is non-linear. Therefore, the linearity needs to be adjusted. For linearity, adjust x variables log(x).

# Simple linear regression of mpg on acceleration
filtered_acceleration_regression <- lm(
  formula = mpg ~ acceleration,
  data = Auto
)

# Coefficient table and fit statistics
summary(filtered_acceleration_regression)
## 
## Call:
## lm(formula = mpg ~ acceleration, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.989  -5.616  -1.199   4.801  23.239 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.8332     2.0485   2.359   0.0188 *  
## acceleration   1.1976     0.1298   9.228   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.08 on 390 degrees of freedom
## Multiple R-squared:  0.1792, Adjusted R-squared:  0.1771 
## F-statistic: 85.15 on 1 and 390 DF,  p-value: < 2.2e-16
# Residuals vs fitted values
plot(filtered_acceleration_regression, which = 1)

# Normal Q-Q plot of the residuals
plot(filtered_acceleration_regression, which = 2)

Transform the acceleration predictor variable

# mpg regressed on log(acceleration); the outcome is left untransformed
filtered_log_acceleration_regression <- lm(
  formula = mpg ~ log(acceleration),
  data = Auto
)

# Coefficient table and fit statistics
summary(filtered_log_acceleration_regression)
## 
## Call:
## lm(formula = mpg ~ log(acceleration), data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.0234  -5.6231  -0.9787   4.5943  23.0872 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -27.834      5.373  -5.180 3.56e-07 ***
## log(acceleration)   18.801      1.966   9.565  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.033 on 390 degrees of freedom
## Multiple R-squared:   0.19,  Adjusted R-squared:  0.1879 
## F-statistic: 91.49 on 1 and 390 DF,  p-value: < 2.2e-16
# Residuals vs fitted values
plot(filtered_log_acceleration_regression, which = 1)

# Normal Q-Q plot of the residuals
plot(filtered_log_acceleration_regression, which = 2)

Then, add color to your plot for cylinders. Is an interaction term appropriate? Finally, replace cylinders to color by origin. Is an interaction term appropriate?

Interaction terms needed for both cylinders and origin

# mpg against log(acceleration), coloured by cylinder group, with one
# least-squares line per group. `se = FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
ggplot(Auto, aes(x = log(acceleration), y = mpg, col = new_cylinders_fac)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "log acceleration",
    y = "MPG",
    col = "cylinders"
  )
## `geom_smooth()` using formula = 'y ~ x'

# mpg against log(acceleration), coloured by origin, with one least-squares
# line per group. `se = FALSE` is spelled out because `F` is an ordinary
# variable that can be reassigned.
ggplot(Auto, aes(x = log(acceleration), y = mpg, col = origin_fac)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "log acceleration",
    y = "MPG",
    col = "origins"
  )
## `geom_smooth()` using formula = 'y ~ x'

add interaction terms on log acceleration and cylinders and log acceleration and origin

# log(acceleration) model with interactions for both the cylinder group and
# origin
filtered_log_acceleration_regression_add_interaction <- lm(
  formula = mpg ~ log(acceleration) * new_cylinders_fac +
    log(acceleration) * origin_fac,
  data = Auto
)

# Coefficient table and fit statistics
summary(filtered_log_acceleration_regression_add_interaction)
## 
## Call:
## lm(formula = mpg ~ log(acceleration) * new_cylinders_fac + log(acceleration) * 
##     origin_fac, data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.7029  -2.5503  -0.3744   2.0779  18.7160 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            60.064     10.408   5.771 1.65e-08 ***
## log(acceleration)                     -11.524      3.732  -3.088 0.002167 ** 
## new_cylinders_fac6                     -4.979     13.704  -0.363 0.716594    
## new_cylinders_fac8                    -57.152     12.297  -4.648 4.65e-06 ***
## origin_facEuropean                    -60.415     13.376  -4.517 8.43e-06 ***
## origin_facJapanese                    -47.733     15.028  -3.176 0.001615 ** 
## log(acceleration):new_cylinders_fac6   -1.112      4.931  -0.225 0.821718    
## log(acceleration):new_cylinders_fac8   16.255      4.529   3.589 0.000375 ***
## log(acceleration):origin_facEuropean   21.688      4.784   4.533 7.82e-06 ***
## log(acceleration):origin_facJapanese   18.443      5.405   3.413 0.000714 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.353 on 375 degrees of freedom
##   (7 observations deleted due to missingness)
## Multiple R-squared:  0.6987, Adjusted R-squared:  0.6915 
## F-statistic: 96.64 on 9 and 375 DF,  p-value: < 2.2e-16
# Residuals vs fitted values
plot(filtered_log_acceleration_regression_add_interaction, which = 1)

# Normal Q-Q plot of the residuals
plot(filtered_log_acceleration_regression_add_interaction, which = 2)

Generate a plot of mpg and horsepower. Do you think a transformation of the horsepower variable might be appropriate? If so, which one and why?

It seems like on the residual plot, the pattern is non-linear and the variance of errors seems to increase from left to right. Therefore, the linearity and equal variance of errors need to be adjusted. For linearity, adjust x variables log(x), for equal variance, adjust y variable log(y)

# Simple linear regression of mpg on horsepower
filtered_horsepower_regression <- lm(
  formula = mpg ~ horsepower,
  data = Auto
)

# Coefficient table and fit statistics
summary(filtered_horsepower_regression)
## 
## Call:
## lm(formula = mpg ~ horsepower, data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.5710  -3.2592  -0.3435   2.7630  16.9240 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 39.935861   0.717499   55.66   <2e-16 ***
## horsepower  -0.157845   0.006446  -24.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared:  0.6059, Adjusted R-squared:  0.6049 
## F-statistic: 599.7 on 1 and 390 DF,  p-value: < 2.2e-16
# Residuals vs fitted values
plot(filtered_horsepower_regression, which = 1)

# Normal Q-Q plot of the residuals
plot(filtered_horsepower_regression, which = 2)

Transform the mpg and the horsepower predictor variable

# Log-log model: log(mpg) regressed on log(horsepower)
filtered_log_horsepower_regression <- lm(
  formula = log(mpg) ~ log(horsepower),
  data = Auto
)

# Coefficient table and fit statistics
summary(filtered_log_horsepower_regression)
## 
## Call:
## lm(formula = log(mpg) ~ log(horsepower), data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.65230 -0.12176  0.00788  0.11631  0.63730 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      6.96065    0.12149   57.30   <2e-16 ***
## log(horsepower) -0.84185    0.02641  -31.88   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1793 on 390 degrees of freedom
## Multiple R-squared:  0.7227, Adjusted R-squared:  0.722 
## F-statistic:  1016 on 1 and 390 DF,  p-value: < 2.2e-16
# Residuals vs fitted values
plot(filtered_log_horsepower_regression, which = 1)

# Normal Q-Q plot of the residuals
plot(filtered_log_horsepower_regression, which = 2)

Then, add color to your plot for cylinders. Is an interaction term appropriate? Finally, replace cylinders to color by origin. Is an interaction term appropriate?

Interaction terms needed for both cylinders and origin

# log(mpg) against log(horsepower), coloured by cylinder group, with one
# least-squares line per group. `se = FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
ggplot(Auto, aes(x = log(horsepower), y = log(mpg), col = new_cylinders_fac)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "log horsepower",
    y = "log MPG",
    col = "cylinders"
  )
## `geom_smooth()` using formula = 'y ~ x'

# log(mpg) against log(horsepower), coloured by origin, with one least-squares
# line per group. `se = FALSE` is spelled out because `F` is an ordinary
# variable that can be reassigned.
ggplot(Auto, aes(x = log(horsepower), y = log(mpg), col = origin_fac)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "log horsepower",
    y = "log MPG",
    col = "origins"
  )
## `geom_smooth()` using formula = 'y ~ x'

add interaction terms on log horsepower and cylinders and log horsepower and origin

From the summary report, it seems like the interaction term on log horsepower and origin is not needed (Not significant).

# Log-log horsepower model with interactions for both the cylinder group and
# origin
filtered_log_horsepower_regression_add_interaction <- lm(
  formula = log(mpg) ~ log(horsepower) * new_cylinders_fac +
    log(horsepower) * origin_fac,
  data = Auto
)

# Coefficient table and fit statistics
summary(filtered_log_horsepower_regression_add_interaction)
## 
## Call:
## lm(formula = log(mpg) ~ log(horsepower) * new_cylinders_fac + 
##     log(horsepower) * origin_fac, data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.38413 -0.09331 -0.00533  0.09281  0.66113 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         5.65931    0.47303  11.964  < 2e-16 ***
## log(horsepower)                    -0.53563    0.10720  -4.997 8.96e-07 ***
## new_cylinders_fac6                 -2.61657    0.65987  -3.965 8.78e-05 ***
## new_cylinders_fac8                  0.23332    0.63829   0.366 0.714912    
## origin_facEuropean                  0.22792    0.53524   0.426 0.670480    
## origin_facJapanese                  0.01467    0.53267   0.028 0.978037    
## log(horsepower):new_cylinders_fac6  0.52071    0.14488   3.594 0.000369 ***
## log(horsepower):new_cylinders_fac8 -0.09887    0.13670  -0.723 0.469947    
## log(horsepower):origin_facEuropean -0.05791    0.12129  -0.477 0.633343    
## log(horsepower):origin_facJapanese  0.01833    0.12072   0.152 0.879394    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1515 on 375 degrees of freedom
##   (7 observations deleted due to missingness)
## Multiple R-squared:  0.8082, Adjusted R-squared:  0.8036 
## F-statistic: 175.6 on 9 and 375 DF,  p-value: < 2.2e-16
# Residuals vs fitted values
plot(filtered_log_horsepower_regression_add_interaction, which = 1)

# Normal Q-Q plot of the residuals
plot(filtered_log_horsepower_regression_add_interaction, which = 2)

Decide if you want to use a transformation on displacement, acceleration, and/or horsepower, or interaction term(s). Consider the implications of estimate interpretations. Fit the model with the additional terms that you choose, and generate the residual and qq plots. What do you notice compared to what you saw in #2?

Detailed interpretation for the above regression results:

log(horsepower): On average, for 4-cylinder cars (the reference level, since horsepower interacts with cylinders), a one-unit increase in log(horsepower) is associated with a decrease of 17.1803 units in mpg, all else constant.

new_cylinders_fac6: On average, the mpg of 6-cylinder cars is 63.2509 lower than that of 4-cylinder cars when log(horsepower) and log(acceleration) are both zero (cylinders interacts with both), holding all else constant.

new_cylinders_fac8: On average, the mpg of 8-cylinder cars is 72.8153 lower than that of 4-cylinder cars when log(horsepower) and log(acceleration) are both zero (cylinders interacts with both), holding all else constant.

log(displacement): On average, for American cars (the reference level, since displacement interacts with origin), a one-unit increase in log(displacement) is associated with a decrease of 4.4958 units in mpg, all else constant.

origin_facEuropean: On average, European cars’ mpg is 46.1726 lower than American cars’ mpg, holding all else constant.

origin_facJapanese: On average, Japanese cars’ mpg is 1.0518 lower than American cars’ mpg, holding all else constant. it’s not statistically significant (p-value > 0.05).

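log(acceleration): On average, for 4-cylinder American cars (the reference levels, since acceleration interacts with both cylinders and origin), a one-unit increase in log(acceleration) is associated with a decrease of 17.4285 units in mpg, all else constant.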

log(horsepower):new_cylinders_fac6: Per increase in log(horsepower), mpg increase by an additional 10.4851, on average, all else held constant, for new cylinders 6 compared to new cylinders 4.

log(horsepower):new_cylinders_fac8: Per increase in log(horsepower), mpg increase by an additional 7.3571, on average, all else held constant, for new cylinders 8 compared to new cylinders 4.

log(displacement):origin_facEuropean: Per increase in log(displacement), mpg increase by an additional 1.7288, on average, all else held constant, for European cars compared to American cars. it’s not statistically significant (p-value > 0.05).

log(displacement):origin_facJapanese: Per increase in log(displacement), mpg decrease by an additional 0.9837, on average, all else held constant, for Japanese cars compared to American cars. it’s not statistically significant (p-value > 0.05).

new_cylinders_fac6:log(acceleration): Per increase in log(acceleration), mpg increases by an additional 4.7260, on average, all else held constant, for new cylinders 6 compared to new cylinders 4. it’s not statistically significant (p-value > 0.05).

new_cylinders_fac8:log(acceleration): Per increase in log(acceleration), mpg increases by an additional 13.5197, on average, all else held constant, for new cylinders 8 compared to new cylinders 4.

origin_facEuropean:log(acceleration): Per increase in log(acceleration), mpg increase by an additional 13.1480, on average, all else held constant, for European cars compared to American cars.

origin_facJapanese:log(acceleration): Per increase in log(acceleration), mpg increases by an additional 2.5459, on average, all else held constant, for Japanese cars compared to American cars. It is not statistically significant (p-value > 0.05).

For the residual plot, linearity looks good, but the equal-variance assumption is slightly off (the spread of the residuals increases from left to right).

for qq plot, the normality of errors looks good.

# Combined model on the original mpg scale: log-transformed predictors plus
# the cylinder and origin interactions
final_regression_1 <- lm(
  formula = mpg ~ log(horsepower) * new_cylinders_fac +
    log(displacement) * origin_fac +
    log(acceleration) * new_cylinders_fac +
    log(acceleration) * origin_fac,
  data = Auto
)

# Coefficient table and fit statistics
summary(final_regression_1)
## 
## Call:
## lm(formula = mpg ~ log(horsepower) * new_cylinders_fac + log(displacement) * 
##     origin_fac + log(acceleration) * new_cylinders_fac + log(acceleration) * 
##     origin_fac, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.027  -2.166  -0.440   1.749  18.454 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          173.5642    14.2932  12.143  < 2e-16 ***
## log(horsepower)                      -17.1803     2.1723  -7.909 3.05e-14 ***
## new_cylinders_fac6                   -63.2509    26.7229  -2.367  0.01845 *  
## new_cylinders_fac8                   -72.8153    22.7244  -3.204  0.00147 ** 
## log(displacement)                     -4.4958     1.7677  -2.543  0.01139 *  
## origin_facEuropean                   -46.1726    17.2486  -2.677  0.00776 ** 
## origin_facJapanese                    -1.0518    29.1628  -0.036  0.97125    
## log(acceleration)                    -17.4285     3.1695  -5.499 7.15e-08 ***
## log(horsepower):new_cylinders_fac6    10.4851     3.9525   2.653  0.00833 ** 
## log(horsepower):new_cylinders_fac8     7.3571     3.2250   2.281  0.02310 *  
## log(displacement):origin_facEuropean   1.7288     2.8063   0.616  0.53826    
## log(displacement):origin_facJapanese  -0.9837     3.3437  -0.294  0.76878    
## new_cylinders_fac6:log(acceleration)   4.7260     4.8312   0.978  0.32860    
## new_cylinders_fac8:log(acceleration)  13.5197     4.1685   3.243  0.00129 ** 
## origin_facEuropean:log(acceleration)  13.1480     4.1568   3.163  0.00169 ** 
## origin_facJapanese:log(acceleration)   2.5459     5.9972   0.425  0.67143    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.584 on 369 degrees of freedom
##   (7 observations deleted due to missingness)
## Multiple R-squared:  0.7991, Adjusted R-squared:  0.7909 
## F-statistic: 97.83 on 15 and 369 DF,  p-value: < 2.2e-16
# Residuals vs fitted values
plot(final_regression_1, which = 1)

# Normal Q-Q plot of the residuals
plot(final_regression_1, which = 2)

Fit the model using log(mpg) as the outcome. Generate the residual and qq plots and comment on the difference(s) that you observe. Consider the implications for model interpretation. Do you think the transformation of the outcome variable is useful here?

I observe that more of the interaction terms become insignificant after transforming mpg to log(mpg).

for residual plot, the linearity and equal variance looks good.

for qq plot, the normality of errors looks good.

I think the transformation of the outcome variable is useful here.

# Same right-hand side as final_regression_1, but with log(mpg) as the outcome
final_regression_2 <- lm(
  formula = log(mpg) ~ log(horsepower) * new_cylinders_fac +
    log(displacement) * origin_fac +
    log(acceleration) * new_cylinders_fac +
    log(acceleration) * origin_fac,
  data = Auto
)

# Coefficient table and fit statistics
summary(final_regression_2)
## 
## Call:
## lm(formula = log(mpg) ~ log(horsepower) * new_cylinders_fac + 
##     log(displacement) * origin_fac + log(acceleration) * new_cylinders_fac + 
##     log(acceleration) * origin_fac, data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32355 -0.09703 -0.00723  0.08505  0.67430 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                           8.52470    0.55998  15.223  < 2e-16 ***
## log(horsepower)                      -0.58203    0.08511  -6.839 3.32e-11 ***
## new_cylinders_fac6                   -1.43049    1.04695  -1.366  0.17266    
## new_cylinders_fac8                   -0.18524    0.89030  -0.208  0.83530    
## log(displacement)                    -0.18518    0.06926  -2.674  0.00783 ** 
## origin_facEuropean                   -1.53140    0.67576  -2.266  0.02402 *  
## origin_facJapanese                   -0.42233    1.14254  -0.370  0.71186    
## log(acceleration)                    -0.63214    0.12417  -5.091 5.70e-07 ***
## log(horsepower):new_cylinders_fac6    0.27227    0.15485   1.758  0.07953 .  
## log(horsepower):new_cylinders_fac8   -0.15011    0.12635  -1.188  0.23558    
## log(displacement):origin_facEuropean  0.04389    0.10995   0.399  0.68995    
## log(displacement):origin_facJapanese  0.02180    0.13100   0.166  0.86791    
## new_cylinders_fac6:log(acceleration)  0.02377    0.18928   0.126  0.90012    
## new_cylinders_fac8:log(acceleration)  0.28929    0.16331   1.771  0.07732 .  
## origin_facEuropean:log(acceleration)  0.45279    0.16286   2.780  0.00571 ** 
## origin_facJapanese:log(acceleration)  0.13079    0.23496   0.557  0.57811    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1404 on 369 degrees of freedom
##   (7 observations deleted due to missingness)
## Multiple R-squared:  0.8379, Adjusted R-squared:  0.8313 
## F-statistic: 127.1 on 15 and 369 DF,  p-value: < 2.2e-16
# Residuals vs fitted values
plot(final_regression_2, which = 1)

# Normal Q-Q plot of the residuals
plot(final_regression_2, which = 2)

# No `which` given, so plot.lm's default set of diagnostic plots is drawn
plot(final_regression_2)